In [53]:
import pandas as pd
In [75]:
# load both sheets as new dataframes
shows_df = pd.read_csv("show_category.csv")
views_df = pd.read_excel("views.xls")
In [76]:
shows_df.head()
Out[76]:
In [77]:
shows_df = shows_df.set_index('showname')
shows_df.head()
Out[77]:
In [78]:
views_df.head()
Out[78]:
In [79]:
views_df = views_df.set_index('viewer_id')
views_df.head() # note that we can have repeating viewer_id values (they're non-unique)
Out[79]:
In [80]:
# we can select out the column to work on, then use the built-in str (string) functions
# to replace hyphens (we do this and just print the result to screen)
views_df['show_watched'].str.replace("-", "")
Out[80]:
In [81]:
# now we do the fix in-place
views_df['show_watched'] = views_df['show_watched'].str.replace("-", "")
# NOTE if you comment out the line above, you'll get a NaN in the final table
# as `battle-star` won't be joined
views_df
Out[81]:
In [82]:
print("Index info:", views_df.index)
views_df.ix[22] # select the items with index 22 (note this is an integer, not string value)
Out[82]:
In [83]:
shows_views_df = views_df.join(shows_df, on='show_watched')
shows_views_df
Out[83]:
In [84]:
# take out two relevant columns, group by category, sum the views
shows_views_df[['views', 'category']].groupby('category').sum()
Out[84]: